library(Seurat)
library(SnapATAC)
Loading required package: Matrix
Loading required package: rhdf5
library(tidyverse)
[30m── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.2.1 [32m✔[30m [34mpurrr [30m 0.3.3
[32m✔[30m [34mtibble [30m 2.1.3 [32m✔[30m [34mdplyr [30m 0.8.3
[32m✔[30m [34mtidyr [30m 1.0.0 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.4.0[39m
[30m── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mtidyr[30m::[32mexpand()[30m masks [34mMatrix[30m::expand()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mtidyr[30m::[32mpack()[30m masks [34mMatrix[30m::pack()
[31m✖[30m [34mtidyr[30m::[32munpack()[30m masks [34mMatrix[30m::unpack()[39m
library(DescTools) # 4 AUC function
Registered S3 method overwritten by 'DescTools':
method from
reorder.factor gdata
library(glue)
Attaching package: ‘glue’
The following object is masked from ‘package:dplyr’:
collapse
library(ggalluvial) # 4 river plot
library(ggpubr)
Loading required package: magrittr
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
source("~/multiOmic_benchmark/utils.R")
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
## Make output directory
outdir <- "~/multiOmic_benchmark/report/output/20191113_labelTransferEDA_F74_v2/"
ifelse(!dir.exists(outdir), dir.create(outdir), FALSE)
[1] FALSE
# model.cca <- readRDS("~/models/modelCCA_union_hvg_F74_SCElist_20191113.RDS")
# model.liger <- readRDS("~/models/modelLiger_union_hvg_F74_SCElist_20191113.RDS")
# model.conos <- readRDS("~/models/modelConos_union_hvg_F74_SCElist_20191113.RDS")
seu.cca <- readRDS("~/models/labelTransferCCA_union_hvg_F74_SCElist_20191119.RDS")
seu.liger <- readRDS("~/models/labelTransferLiger_union_hvg_F74_SCElist_20191119.RDS")
seu.conos <- readRDS("~/models/labelTransferConos_union_hvg_F74_SCElist_20191119.RDS")
integrate_features <- scan("~/intFeatures_union_hvg_2000_F74_SCElist_20191113.txt", what='')
Read 10603 items
int.list <- list(CCA=seu.cca, Liger=seu.liger, Conos=seu.conos)
# ## Make method color palette
# method.palette <- brewer_palette_4_values(names(int.list), "Set1")
Visualize label transfer on original ATAC data (embedded SnapATAC bins)
## Load original data
orig.ATAC <- readRDS("~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS")
sce.list <- readRDS("~/my_data/integrated_thymus/F74_SCElist_20191119.RDS")
orig.RNA <- sce.list$RNA
Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply,
parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
The following object is masked from ‘package:Matrix’:
which
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated,
eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, match,
mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames,
sapply, setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following objects are masked from ‘package:dplyr’:
first, rename
The following object is masked from ‘package:tidyr’:
expand
The following object is masked from ‘package:Matrix’:
expand
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Attaching package: ‘IRanges’
The following objects are masked from ‘package:glue’:
collapse, trim
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
The following object is masked from ‘package:purrr’:
reduce
Loading required package: GenomeInfoDb
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor,
see 'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: DelayedArray
Loading required package: matrixStats
Attaching package: ‘matrixStats’
The following objects are masked from ‘package:Biobase’:
anyMissing, rowMedians
The following object is masked from ‘package:dplyr’:
count
Loading required package: BiocParallel
Attaching package: ‘DelayedArray’
The following objects are masked from ‘package:matrixStats’:
colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
The following object is masked from ‘package:purrr’:
simplify
The following objects are masked from ‘package:base’:
aperm, apply, rowsum
Attaching package: ‘SummarizedExperiment’
The following object is masked from ‘package:Seurat’:
Assays
## Make SeuratObjects
atac.seu <- snapToSeurat(
obj=orig.ATAC,
eigs.dims=1:20,
norm=TRUE,
scale=TRUE
)
Epoch: checking input parameters ...
Non-unique features (rownames) present in the input matrix, making uniquePerforming log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Centering and scaling data matrix
|
| | 0%
|
|=== | 3%
|
|====== | 6%
|
|========= | 9%
|
|============ | 12%
|
|=============== | 15%
|
|================== | 18%
|
|===================== | 21%
|
|======================== | 24%
|
|=========================== | 26%
|
|============================== | 29%
|
|================================= | 32%
|
|==================================== | 35%
|
|======================================= | 38%
|
|========================================== | 41%
|
|============================================= | 44%
|
|================================================ | 47%
|
|==================================================== | 50%
|
|======================================================= | 53%
|
|========================================================== | 56%
|
|============================================================= | 59%
|
|================================================================ | 62%
|
|=================================================================== | 65%
|
|====================================================================== | 68%
|
|========================================================================= | 71%
|
|============================================================================ | 74%
|
|=============================================================================== | 76%
|
|================================================================================== | 79%
|
|===================================================================================== | 82%
|
|======================================================================================== | 85%
|
|=========================================================================================== | 88%
|
|============================================================================================== | 91%
|
|================================================================================================= | 94%
|
|==================================================================================================== | 97%
|
|=======================================================================================================| 100%
atac.seu <- RenameCells(atac.seu, new.names = orig.ATAC@metaData$barcode)
## Add cell type predictions
getPredictedLabels <- function(seu.int, int.name, id.col="predicted.id", score.col="score"){
pred.df <- seu.int$ATAC@meta.data[,c(id.col, score.col), drop=F]
rownames(pred.df) <- str_remove(rownames(pred.df), "^ATAC_")
colnames(pred.df) <- c(str_c("predicted.id", "_", int.name), str_c("score", "_", int.name))
pred.df
}
pred.cca <- getPredictedLabels(seu.cca, "CCA", score.col = "prediction.score.max")
pred.liger <- getPredictedLabels(seu.liger, "Liger")
pred.conos <- getPredictedLabels(seu.conos, "Conos")
if (all(rownames(pred.conos) == rownames(pred.cca)) & all(rownames(pred.conos) == rownames(pred.liger))) {
atac.seu <- AddMetaData(atac.seu, metadata = cbind(pred.cca, pred.liger, pred.conos))
} else {
stop("Non corresponding cell names")
}
## make cell type palette
cell.types <- levels(seu.cca$RNA$annotation)
cell.type.pal <- setNames(sample(gg_color_hue(length(cell.types) )), cell.types)
atac.seu <- RunUMAP(atac.seu, reduction = "SnapATAC", reduction.name = "umap.snap", dims=1:20)
The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session15:59:03 UMAP embedding parameters a = 0.9922 b = 1.112
15:59:03 Read 5793 rows and found 20 numeric columns
15:59:03 Using Annoy for neighbor search, n_neighbors = 30
15:59:03 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:59:04 Writing NN index file to temp file /tmp/RtmpsI55IG/file18a7368b8bea
15:59:04 Searching Annoy index using 1 thread, search_k = 3000
15:59:06 Annoy recall = 100%
15:59:07 Commencing smooth kNN distance calibration using 1 thread
15:59:09 Initializing from normalized Laplacian + noise
15:59:09 Commencing optimization for 500 epochs, with 237344 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:59:24 Optimization finished
ggpubr::ggarrange(
plotlist = list(
DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_CCA" , cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("CCA"),
DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Liger", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Liger"),
DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Conos", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Conos")
),
common.legend = TRUE, ncol=3, nrow=1
) +
ggsave(paste0(outdir, "umap_labels.png"), width=16, height = 8)
Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
Please use `as_label()` or `as_name()` instead.
[90mThis warning is displayed once per session.[39m
pl <- DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_CCA", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("CCA")
plotly::ggplotly(pl)
geom_GeomTextRepel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
pl <- DimPlot(atac.seu, reduction = "umap.snap", group.by = "predicted.id_Conos", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Conos")
plotly::ggplotly(pl)
geom_GeomTextRepel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
orig.RNA.seu <- as.Seurat(orig.RNA)
orig.RNA.seu <- FindVariableFeatures(orig.RNA.seu)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
orig.RNA.seu <- ScaleData(orig.RNA.seu)
Centering and scaling data matrix
|
| | 0%
|
|==================================================== | 50%
|
|=======================================================================================================| 100%
orig.RNA.seu <- RunPCA(orig.RNA.seu)
PC_ 1
Positive: TRBC2, TRBC1, HMGA1, HIST1H1C, HIST1H3H, ITM2A, HIST1H2BJ, SMIM24, TRAV13-2, FXYD2
TRBV7-2, PCGF5, HIST1H2BH, IL32, HIST1H2BN, CHAC1, RASD1, TRBV27, TRAV13-1, TRAV8-2
TRAV8-4, PTPN6, SELL, HIST1H2BG, TAGAP, TRDC, TRAV38-2DV8, TRAV29DV5, TRBV9, TRAV41
Negative: CALD1, COL5A2, COL6A1, COL6A2, SPARC, THY1, DCN, COL3A1, NFIB, SPARCL1
TSHZ2, CPE, PLAC9, NID1, FKBP10, PTN, FLRT2, MAP1B, EFEMP2, BGN
CXCL12, RBP1, LAMB1, AHNAK, COL1A1, COL5A1, FSTL1, LUM, LAMA4, MDK
PC_ 2
Positive: SFRP1, NTRK2, PLAT, ISLR, NRK, SCARA5, ASPN, OSR1, OLFML3, MXRA8
CAPN6, PTPRD, PLP1, TMEFF2, CREB3L1, DKK3, CERCAM, MMP2, EBF2, SMOC2
CDO1, COL12A1, PDGFRA, LRRC17, THBS2, HTRA3, SFRP2, ANGPTL1, MAB21L1, MXRA5
Negative: MKI67, CDK1, NUSAP1, TOP2A, CCNA2, RRM2, UBE2C, BIRC5, KIFC1, TYMS
UBE2T, AURKB, CENPF, CENPM, CDCA8, TACC3, NCAPG, TPX2, ASF1B, CDKN3
GTSE1, CDCA3, HJURP, SPC25, MAD2L1, CDC20, PLK1, DLGAP5, NUF2, KIF22
PC_ 3
Positive: CCNA2, NUF2, GTSE1, CDCA8, UBE2T, AURKB, PBK, CDK1, NCAPG, NDC80
KIFC1, CDCA3, HJURP, MAD2L1, SPC25, PLK1, KIF15, DEPDC1B, BIRC5, CDCA5
CKS1B, RRM2, DLGAP5, HMMR, CENPA, KIF22, KIF20A, CDCA2, CENPF, KIF2C
Negative: HLA-DRB5, HLA-DRA, TYROBP, HLA-DRB1, HLA-DPA1, HLA-DPB1, HLA-DQA1, C1QC, C1QB, RNASE1
HLA-DQB1, A2M, C1QA, CSF1R, STAB1, HCK, HLA-DMB, FOLR2, SAMHD1, LYZ
MS4A7, SPI1, CD36, MS4A6A, CYBB, TMEM176B, CD74, IGSF6, MPEG1, MS4A4A
PC_ 4
Positive: GYPA, GYPB, NFE2, GYPE, ANK1, SLC4A1, RHAG, AHSP, DMTN, KLF1
GATA1, TMOD1, TMEM56, HBZ, HBG1, GMPR, C17orf99, SMIM5, HBQ1, TSPO2
ALAS2, PHOSPHO1, CR1L, TRIM58, HBM, EPB42, RHD, RHCE, SPTA1, SMIM1
Negative: TMSB10, TRBC2, CCL21, IL32, CXCL13, TRBC1, APLNR, CCL19, MIF, HMGB1
COX4I2, COL4A1, COL15A1, MADCAM1, FDCSP, CCL17, NTS, TNC, CDH5, FABP4
CAV1, COL4A2, CRIP2, RGS5, KLRB1, MYLK, IFITM1, IL33, PAPLN, HPN
PC_ 5
Positive: APLNR, CDH5, CAV1, COL15A1, CRIP2, COL4A1, CCL21, KDR, CLDN5, MADCAM1
FABP4, IL33, COL4A2, CXCL13, TM4SF1, PODXL, CCL19, COX4I2, ESAM, BCAM
NTS, PAPLN, SPNS2, MYLK, C8orf4, ADGRF5, TM4SF18, TGM2, RP11-536O18.1, CXorf36
Negative: CSF1R, MS4A4A, CYBB, PLD4, MS4A6A, CD163, HCK, IGSF6, FOLR2, ADAP2
CD86, MS4A7, MARCH1, MRC1, F13A1, MPEG1, CD14, SPI1, HLA-DMB, GPR34
CD33, TGFBI, CLEC7A, TIMD4, CEBPA, SIGLEC1, CSF2RA, SLC15A3, LY86, AGR2
orig.RNA.seu <- RunUMAP(orig.RNA.seu, dims=1:40)
16:03:30 UMAP embedding parameters a = 0.9922 b = 1.112
16:03:30 Read 8321 rows and found 40 numeric columns
16:03:30 Using Annoy for neighbor search, n_neighbors = 30
16:03:30 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:03:31 Writing NN index file to temp file /tmp/RtmpsI55IG/file18a76093336b
16:03:31 Searching Annoy index using 1 thread, search_k = 3000
16:03:34 Annoy recall = 100%
16:03:35 Commencing smooth kNN distance calibration using 1 thread
16:03:37 Initializing from normalized Laplacian + noise
16:03:37 Commencing optimization for 500 epochs, with 367644 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:03:58 Optimization finished
plotly::ggplotly(DimPlot(orig.RNA.seu, group.by="annotation"))
Quantifies the uncertainty of the prediction. Calculated differently for every method, but used to define which cells are “unassigned”.
orig.composition <- orig.RNA$annotation
orig.frac <- table(orig.composition)/length(orig.composition)
orig.frac.df <- data.frame(orig.frac) %>%
dplyr::rename(predicted.id=orig.composition, frac.label=Freq) %>%
mutate(method="original.RNA")
score_cols <- str_subset(colnames(atac.seu@meta.data), 'score_')
label_cols <- str_subset(colnames(atac.seu@meta.data), 'predicted.id_')
pred.labels.df <- imap(list(CCA=pred.cca, Liger=pred.liger, Conos=pred.conos), ~
rownames_to_column(.x, "cell") %>%
rename_all(funs(str_remove(., str_c("_",.y)))) %>%
mutate(method=.y)
) %>%
purrr::reduce(bind_rows) %>%
mutate(score=ifelse(is.na(score), 0, score))
funs() is soft deprecated as of dplyr 0.8.0
Please use a list of either functions or lambdas:
# Simple named list:
list(mean = mean, median = median)
# Auto named with `tibble::lst()`:
tibble::lst(mean, median)
# Using lambdas
list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
[90mThis warning is displayed once per session.[39mbinding character and factor vector, coercing into character vector
predict_score_hist <-
pred.labels.df %>%
ggplot(aes(score, fill=method)) +
geom_histogram(position="identity", alpha=0.8, bins=40) +
facet_grid(method ~.) +
scale_fill_brewer(palette="Set1") +
xlab("Label prediction score") +
theme_bw(base_size = 16) +
theme(legend.position = "top")
cutoffs <- seq(0,1,0.05)
predict_score_cumedist <-
pred.labels.df %>%
group_by(method) %>%
mutate(bins=cut(score, breaks = cutoffs)) %>%
mutate(score=as.numeric(str_remove_all(as.character(bins), ".+,|]"))) %>%
ggplot(aes(score, color=method)) +
stat_ecdf(size=0.8, alpha=0.7) +
scale_color_brewer(palette = "Set1") +
ylab("Fraction of unassigned cells") +
xlab("Prediction score cutoff") +
theme_bw(base_size = 16) +
xlim(0,1) +
coord_fixed() +
guides(color="none")
ggpubr::ggarrange(predict_score_hist, predict_score_cumedist, common.legend = TRUE, widths = c(0.8, 1.2),
labels=c("A", "B")) +
ggsave(paste0(outdir, "prediction_score_distribution.png"), height = 6, width = 10)
Removed 47 rows containing non-finite values (stat_ecdf).
ggpubr::ggarrange(
plotlist = list(
FeaturePlot(atac.seu, reduction = "umap.snap", feature = "score_CCA" , coord.fixed = TRUE) + ggtitle("CCA"),
FeaturePlot(atac.seu, reduction = "umap.snap", feature = "score_Liger", coord.fixed = TRUE) + ggtitle("Liger"),
FeaturePlot(atac.seu, reduction = "umap.snap", feature = "score_Conos", coord.fixed = TRUE) + ggtitle("Conos")
),
common.legend = TRUE, ncol=3, nrow=1
) +
ggsave(paste0(outdir, "prediction_score_umaps.png"), height = 7, width=14)
Compare cell type fractions (w uncertainty)
orig.rank.df <- orig.frac.df %>%
mutate(orig.rank=dense_rank(frac.label)) %>%
select(orig.rank, predicted.id) %>%
distinct() %>%
arrange(orig.rank) %>%
column_to_rownames("predicted.id")
pred.labels.df %>%
group_by(method) %>%
drop_na() %>%
mutate(tot.cells=n()) %>%
ungroup() %>%
group_by(method, predicted.id) %>%
summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=mean(score)) %>%
mutate(frac.label=tot.label/tot.cells) %>%
bind_rows(orig.frac.df) %>%
mutate(orig.rank = orig.rank.df[predicted.id,]) %>%
mutate(predicted.id=factor(predicted.id, levels=rownames(orig.rank.df)))%>%
# select(method, predicted.id, frac.label) %>%
# distinct() %>%
ggplot(aes(predicted.id, frac.label, fill=mean.score, color=mean.score)) +
geom_point(size=2) +
geom_col(width=0.05) +
coord_flip() +
# geom_line(aes(group=method)) +
facet_wrap(method~., nrow=1, ncol=4, scales="free_x") +
scale_color_viridis_c() +
scale_fill_viridis_c() +
ylab("Fraction of cells") +
theme_bw(base_size = 16) +
ggsave(paste0(outdir, "cell_type_composition_bars.png"), width = 15, height = 7)
binding character and factor vector, coercing into character vector
Calculate which fractions of NNs in bin based graph of ATAC cells have the same annotation
k = 30
atac.seu <- FindNeighbors(atac.seu, assay = "ATAC", reduction = "SnapATAC", dims = 1:15, k.param = k)
Computing nearest neighbor graph
Computing SNN
atac.nn.list <- getNNlist(atac.seu)
knn.score.CCA <- test.knn(atac.nn.list, setNames(pred.cca$predicted.id_CCA, rownames(pred.cca)))
p-value will be approximate in the presence of ties
knn.score.conos <- test.knn(atac.nn.list, setNames(pred.conos$predicted.id_Conos, rownames(pred.conos)))
p-value will be approximate in the presence of ties
knn.score.liger <- test.knn(atac.nn.list, setNames(pred.liger$predicted.id_Liger, rownames(pred.liger)))
p-value will be approximate in the presence of ties
knn_score_df <-
list(CCA=knn.score.CCA, conos=knn.score.conos, liger=knn.score.liger) %>%
imap( ~ data.frame(KNN_score = .x$KNN_score, D=.x$D, p.val=.x$p.val, method=.y)) %>%
# imap( ~ data.frame(KNN_score = .x$KNN_score, cell= names(.x$KNN_score), D=.x$D, p.val=.x$p.val, method=.y)) %>%
purrr::reduce(bind_rows) %>%
dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score)) %>%
mutate(data="true")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
knn_score_null_df <-
list(CCA=knn.score.CCA, conos=knn.score.conos, liger=knn.score.liger) %>%
imap( ~ data.frame(KNN_score = .x$null, D=.x$D, p.val=.x$p.val, method=.y)) %>%
# imap( ~ data.frame(KNN_score = .x$KNN_score, cell= names(.x$KNN_score), D=.x$D, p.val=.x$p.val, method=.y)) %>%
purrr::reduce(bind_rows) %>%
dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score)) %>%
mutate(data="null")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
bind_rows(knn_score_df, knn_score_null_df) %>%
ggplot(aes(KNN_score, color=method)) +
stat_ecdf( aes(alpha=data), size=1) +
# stat_ecdf(data=. %>% filter(data=="true"), size=1) +
facet_grid(method~.) +
scale_alpha_discrete( range=c(0.5,1), name="") +
scale_color_brewer(palette = "Set1") +
geom_text(data=. %>% distinct(method, D, p.val),
x=1, y=0.05, hjust=1,
aes(label=glue("KNN score = {round(D, 3)}, p.value: {p.val}"), y=c(0.90, 0.95, 1))) +
theme_bw(base_size = 16) +
ylab("ECDF") + xlab("Fraction of KNNs with shared label") +
ggsave(paste(outdir,"KNN_score_ecdf_unionHVG.png"), height = 6, width=7)
Using alpha for a discrete variable is not advised.
full_join(pred.labels.df, knn_score_df) %>%
ggplot(aes(KNN_score, color=method)) +
stat_ecdf() +
facet_wrap("predicted.id") +
scale_color_brewer(palette = "Set1") +
coord_fixed()
Joining, by = c("cell", "method")
Taking markers from Fig. S2 of JP’s manuscript
thymus.markers <- c("PTPRC", "CD3G", "TYROBP","CD19","HOXA9",'FXYD2',"SH3TC1","CCR9","CD8A", "CD8B","PDCD1", "CRTAM","CD40LG","CCR6","FOXP3","SOX13","ZNF683","KLRD1","TNFSF11","VPREB1","MS4A1", "CLEC9A", "CLEC10A", "LAMP3", "IL3RA", "FCGR3B", "C2","TPSB2",
'ITGA2B',"GYPA", "CDH5", "RGS5","CDH1", "PDGFRA","CRABP1")
# pbmc.markers <- c("CD79A", "MS4A1", "CD8A", "CD8B", "LYZ")
# thymus.markers <- list(Fb=c("PDGFRA", "COLEC11", "FBN1", "PI16"),
# VSMC=c("PDGFRB", 'ACTA2', "RGS5"),
# Endo=c("PECAM1", "CDH5","LYVE1"),
# TEC = c("EPCAM", "FOXN1", "CCL25", "CCL19")
# )
thymus.markers.df <- imap(thymus.markers, ~ data.frame(gene=.x, cell.type.class=.y)) %>%
purrr::reduce(bind_rows)
marker.access.df <- atac.seu@assays$ACTIVITY@data[intersect(thymus.markers, rownames(atac.seu@assays$ACTIVITY)),] %>%
as.matrix() %>%
reshape2::melt(varnames=c("gene", "cell"), value.name="log.counts") %>%
full_join(rownames_to_column(atac.seu@meta.data[, label_cols], "cell")) %>%
# full_join(thymus.markers.df) %>%
pivot_longer(cols=label_cols, names_to = "method", values_to = "predicted.id") %>%
dplyr::mutate(method=str_remove(method,".+_")) %>%
filter(method %in% c("CCA", "Liger", "Conos"))
ordered_cell_types <- c("DN", "DP (Q)", "DP (P)", "SP", "NK", "ILC3", "DC", "Mac", "Ery", "Fib")
markers_pl <-
marker.access.df %>%
mutate(predicted.id = case_when(str_detect(predicted.id, "CD8") ~ "CD8+T",
# str_detect(predicted.id, "CD4") ~ "CD4+T",
TRUE ~ predicted.id
)
) %>%
mutate(predicted.id=factor(predicted.id, levels = ordered_cell_types)) %>%
group_by(method, predicted.id, gene) %>%
dplyr::mutate(frac.cells=sum(log.counts > 0)/n()) %>%
# filter(method=="CCA") %>%
ungroup() %>%
ggplot( aes( gene, predicted.id)) +
geom_point(aes(size=frac.cells, color=frac.cells)) +
facet_grid(method~., space="free", scales="free_x") +
scale_color_gradient(high="darkblue", low="white") +
# scale_color_viridis_c() +
theme_bw(base_size = 16) +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
strip.text.x = element_text(angle=45))
markers_pl
ggsave(paste0(outdir, "Thymus_markers_accessibility.png"), height = 16, width = 12)
Reproducing Fig.2H on T-cell development
t.cell.markers <- list(known.markers = c("CD34", "IGLL1", "TRGC2", "TRDC", "PTCRA", "TRBC2", "TRAC", "CD4", "CD8A", "CD8B"),
chemokine.receptors = c("CCR9", "CCR7"),
tcr.activation = c("CD5", "CD27"),
proliferation=c("PCNA", "CDK1", "MKI67"),
cyclin.D = c("CCND2", "CCND3"),
recombination=c("RAG1", "RAG2"),
apoptosis=c("HRK","BMF", "TP53INP1"),
stage.markers = c("ST18", "HIVEP3", "RGPD3", "SMPD3", "AQP3", "RORC", "SATB1", "TOX2")
)
t.cell.markers.df <- imap(t.cell.markers, ~ data.frame(gene=.x, cell.type.class=.y)) %>%
purrr::reduce(bind_rows)
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorUnequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
ordered.tcells <- c("DN", "DP (P)", "DP (Q)","SP (1)")
tcells.markers.df <-
atac.seu@assays$ACTIVITY@data[intersect(unlist(t.cell.markers), rownames(atac.seu@assays$ACTIVITY)),] %>%
as.matrix() %>%
reshape2::melt(varnames=c("gene", "cell"), value.name="log.counts") %>%
full_join(rownames_to_column(atac.seu@meta.data[, label_cols], "cell")) %>%
pivot_longer(cols=label_cols, names_to = "method", values_to = "predicted.id") %>%
dplyr::mutate(method=str_remove(method,".+_")) %>%
filter(method %in% c("CCA", "Liger", "Conos")) %>%
mutate(predicted.id=ifelse(str_detect(predicted.id, "CD8+"), "CD8+T", predicted.id)) %>%
mutate(predicted.id=ifelse(str_detect(predicted.id, "CD4+"), "CD4+T", predicted.id)) %>%
filter(predicted.id %in% ordered.tcells) %>%
group_by(method, predicted.id, gene) %>%
dplyr::mutate(frac.cells=sum(log.counts > 0)/n(), mean.acc=mean(log.counts)) %>%
ungroup()
Joining, by = "cell"
Column `cell` joining factor and character vector, coercing into character vector
tcells.markers.df %>%
full_join(t.cell.markers.df) %>%
# filter(method=="CCA") %>%
mutate(predicted.id=factor(predicted.id, levels=ordered.tcells)) %>%
ggplot(aes( predicted.id, gene)) +
facet_grid(cell.type.class~method, scales = "free_y", space="free") +
geom_point(aes(size=frac.cells, color=mean.acc)) +
scale_color_gradient(high="darkblue", low="white") +
# scale_color_gradient2(midpoint = 0.5) +
theme_bw(base_size = 16) +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
strip.text.y = element_text(angle=0))
Joining, by = "gene"
Column `gene` joining factor and character vector, coercing into character vector
ggsave(paste0(outdir, "tcell_markers.png"), height = 14, width = 14)